Integration¶

Sanorama Hie et al., 2019
GitHub
Tutorial external API
External external API tutorial

A fix to run scran pooling normalization computeSumFactors in current python environment.

In [1]:
import scanpy as sc
import scanorama

import numpy as np
import pandas as pd

import os
In [2]:
# Working directory 
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')
In [3]:
# rpy2 
os.environ['R_HOME'] = '/home/fdeckert/bin/miniconda3/envs/p.3.8.12-FD20200109SPLENO/lib/R'

# Communication 
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) # Ignore R warnings

# Convertion 
from rpy2.robjects import pandas2ri
import anndata2ri

# Activate conversion globaly 
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
In [4]:
%%R
library(scran)
In [5]:
# Plotting 
import rpy2.robjects as robjects
color_load = robjects.r.source('plotting_global.R')
color = dict()
for i in range(len(color_load[0])):
    color[color_load[0].names[i]] = {key : color_load[0][i].rx2(key)[0] for key in color_load[0][i].names}

sc.set_figure_params(figsize=(5, 5))

Import AnnData¶

In [6]:
adata = sc.read_h5ad('data/object/qc.h5ad')
adata = adata[adata.obs['doublet_class']=='Singlet'].copy()
In [7]:
def set_color(categories): 
    
    categories = [x for x in categories if x in list(adata.obs.columns)]

    for category in categories: 
        
        adata.obs[category] = pd.Series(adata.obs[category], dtype='category')
        
        keys = list(color[category].keys())
        keys = [x for x in keys if x in list(adata.obs[category])]

        adata.obs[category] = adata.obs[category].cat.reorder_categories(keys)
        adata.uns[category+'_colors'] = np.array([color[category].get(key) for key in keys], dtype=object)
        
# Set colors
set_color(list(color.keys()))

Subset adata by sample_group¶

In [8]:
adata_sub = dict()
for sample_group in adata.obs['sample_group'].unique():
    adata_sub[sample_group] = adata[adata.obs['sample_group']==sample_group].copy()
adata_sub = list(adata_sub.values())

Compute scran size factor¶

In [9]:
def scran_input(adata):
    
    adata_pp = adata.copy()
    
    # For computeSumFactors we need a good coverage for each gene
    sc.pp.filter_genes(adata_pp, min_cells=20) 
    
    # Compute groups 
    sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
    sc.pp.log1p(adata_pp)
    sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
    sc.pp.neighbors(adata_pp)
    sc.tl.louvain(adata_pp, key_added='groups', resolution=1)
    
    input_groups = adata_pp.obs['groups']
    
    # Set count matrix
    data_mat = adata.X.T
    
    data = {'input_groups': input_groups, 'data_mat': data_mat}
    
    return(data)
In [10]:
scran_input = [scran_input(x) for x in adata_sub]

NaCl_Rep2¶

In [11]:
data_mat = scran_input[0]['data_mat']
input_groups = list(scran_input[0]['input_groups'])
In [12]:
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
In [13]:
adata_sub[0].obs['size_factors'] = size_factors

NaCl_Rep1¶

In [14]:
data_mat = scran_input[1]['data_mat']
input_groups = list(scran_input[1]['input_groups'])
In [15]:
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
In [16]:
adata_sub[1].obs['size_factors'] = size_factors

CpG_Rep2¶

In [17]:
data_mat = scran_input[2]['data_mat']
input_groups = list(scran_input[2]['input_groups'])
In [18]:
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
In [19]:
adata_sub[2].obs['size_factors'] = size_factors

CpG_Rep1¶

In [20]:
data_mat = scran_input[3]['data_mat']
input_groups = list(scran_input[3]['input_groups'])
In [21]:
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
In [22]:
adata_sub[3].obs['size_factors'] = size_factors

Normalize¶

In [23]:
adata_sub[0].X /= adata_sub[0].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[0])
In [24]:
adata_sub[1].X /= adata_sub[1].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[1])
In [25]:
adata_sub[2].X /= adata_sub[2].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[2])
In [26]:
adata_sub[3].X /= adata_sub[3].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[3])

Highly varialbe genes (HVG)¶

In [27]:
sc.pp.highly_variable_genes(adata_sub[0], flavor='cell_ranger', n_top_genes=5000)
sc.pp.highly_variable_genes(adata_sub[1], flavor='cell_ranger', n_top_genes=5000)
sc.pp.highly_variable_genes(adata_sub[2], flavor='cell_ranger', n_top_genes=5000)
sc.pp.highly_variable_genes(adata_sub[3], flavor='cell_ranger', n_top_genes=5000)
In [28]:
hvg_0 = adata_sub[0].var_names[adata_sub[0].var.highly_variable]
hvg_1 = adata_sub[1].var_names[adata_sub[1].var.highly_variable]
hvg_2 = adata_sub[2].var_names[adata_sub[2].var.highly_variable]
hvg_3 = adata_sub[3].var_names[adata_sub[3].var.highly_variable]

Scale data¶

In [29]:
sc.pp.scale(adata_sub[0])
sc.pp.scale(adata_sub[1])
sc.pp.scale(adata_sub[2])
sc.pp.scale(adata_sub[3])

DEV¶

In [30]:
adata_set = adata_sub.copy()
adata_sub = adata_set.copy()

Scanorama¶

In [31]:
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)

# Concatenate scanorama output 
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)

obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)

# Add X_scanorama integration to adata 
adata.obsm["X_scanorama"] = X_scanorama

# Dimensional reduction and clustering 
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)

# Plot 
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 14772 genes among all datasets
Processing datasets (0, 1)
Processing datasets (2, 3)
Processing datasets (1, 3)
Processing datasets (0, 2)
Processing datasets (0, 3)
Processing datasets (1, 2)

Scanorama (HVG)¶

In [32]:
adata_sub = adata_set.copy()
In [33]:
hvg = list(set(hvg_0) | set(hvg_1) | set(hvg_2) | set(hvg_3))
In [34]:
adata_sub[0] = adata_sub[0][:,hvg].copy()
adata_sub[1] = adata_sub[1][:,hvg].copy()
adata_sub[2] = adata_sub[2][:,hvg].copy()
adata_sub[3] = adata_sub[3][:,hvg].copy()
In [35]:
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)

# Concatenate scanorama output 
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)

obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)

# Add X_scanorama integration to adata 
adata.obsm["X_scanorama"] = X_scanorama

# Dimensional reduction and clustering 
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)

# Plot 
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 8486 genes among all datasets
Processing datasets (2, 3)
Processing datasets (1, 3)
Processing datasets (0, 1)
Processing datasets (0, 2)
Processing datasets (0, 3)
Processing datasets (1, 2)

Scanorama (HVG)¶

In [36]:
adata_sub = adata_set.copy()
In [37]:
hvg = list(set(hvg_0) & set(hvg_1) & set(hvg_2) & set(hvg_3))
In [38]:
adata_sub[0] = adata_sub[0][:,hvg].copy()
adata_sub[1] = adata_sub[1][:,hvg].copy()
adata_sub[2] = adata_sub[2][:,hvg].copy()
adata_sub[3] = adata_sub[3][:,hvg].copy()
In [39]:
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)

# Concatenate scanorama output 
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)

obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)

# Add X_scanorama integration to adata 
adata.obsm["X_scanorama"] = X_scanorama

# Dimensional reduction and clustering 
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)

# Plot 
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 2267 genes among all datasets
Processing datasets (1, 3)
Processing datasets (2, 3)
Processing datasets (0, 2)
Processing datasets (0, 1)
Processing datasets (0, 3)
Processing datasets (1, 2)

Scanorama (HVG)¶

In [40]:
adata_sub = adata_set.copy()
In [41]:
hvg_nacl = list(set(hvg_0) & set(hvg_1))
hvg_cpg = list(set(hvg_2) & set(hvg_3))
hvg = list(set(hvg_nacl) | set(hvg_cpg))
In [42]:
adata_sub[0] = adata_sub[0][:,hvg].copy()
adata_sub[1] = adata_sub[1][:,hvg].copy()
adata_sub[2] = adata_sub[2][:,hvg].copy()
adata_sub[3] = adata_sub[3][:,hvg].copy()
In [43]:
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)

# Concatenate scanorama output 
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)

obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)

# Add X_scanorama integration to adata 
adata.obsm["X_scanorama"] = X_scanorama

# Dimensional reduction and clustering 
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)

# Plot 
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 4517 genes among all datasets
Processing datasets (2, 3)
Processing datasets (1, 3)
Processing datasets (0, 1)
Processing datasets (0, 2)
Processing datasets (0, 3)
Processing datasets (1, 2)

Save results¶

In [44]:
# adata.write('data/object/int.h5ad')
In [45]:
# import sys
# sys.path.insert(0, '../scFacility/script')
# from dirFacility import adata2dir
In [46]:
# adata = adata.raw.to_adata()
# adata.layers['counts'] = adata.X
# adata2dir(adata, 'data/object/int/', assay="RNA", layers="counts", build_dir=True, overwrite=True)